## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("impute")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(impute)
library(broom)
## Warning: package 'broom' was built under R version 3.4.4
library(mice)
## Loading required package: lattice
library(modelr)
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
## 
##     bootstrap
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1     <U+221A> readr   1.1.1
## <U+221A> tibble  1.4.2     <U+221A> purrr   0.2.4
## <U+221A> tidyr   0.8.0     <U+221A> stringr 1.3.0
## <U+221A> ggplot2 2.2.1     <U+221A> forcats 0.3.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x modelr::bootstrap()   masks broom::bootstrap()
## x tidyr::complete()     masks mice::complete()
## x dplyr::filter()       masks stats::filter()
## x data.table::first()   masks dplyr::first()
## x dplyr::lag()          masks stats::lag()
## x data.table::last()    masks dplyr::last()
## x purrr::transpose()    masks data.table::transpose()
library(naniar)
library(visdat)

Step 1

Read the data and select the required information

houses <- read_csv("Melbourne_housing_FULL.csv") %>%
  select(Price, Rooms, Type, Distance, Bedroom2, Bathroom)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   Distance = col_double(),
##   CouncilArea = col_character(),
##   Lattitude = col_double(),
##   Longtitude = col_double(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected               actual file                    expected   <int> <chr>        <chr>                  <chr>  <chr>                   actual 1 12094 BuildingArea no trailing characters .3     'Melbourne_housing_FUL~ file 2 12096 BuildingArea no trailing characters .33    'Melbourne_housing_FUL~ row 3 12139 BuildingArea no trailing characters .23    'Melbourne_housing_FUL~ col 4 12223 BuildingArea no trailing characters .51    'Melbourne_housing_FUL~ expected 5 12252 BuildingArea no trailing characters .3     'Melbourne_housing_FUL~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.

Step 2

Summary the situation of missing value

vis_dat(houses, palette = "cb_safe")

vis_miss(houses, sort_miss = TRUE)+theme(aspect.ratio=1)

According to the chart we foud out that the missing value account for 11.5% of the total data. Onlt three variables (Bathroom, Bedroom2, Price) have missing values.

miss_summary(houses)
## # A tibble: 1 x 7
##   miss_df_prop miss_var_prop miss_case_prop miss_case_table miss_var_table
##          <dbl>         <dbl>          <dbl> <list>          <list>        
## 1        0.115         0.667          0.403 <tibble [4 x 3~ <tibble [5 x ~
## # ... with 2 more variables: miss_var_summary <list>,
## #   miss_case_summary <list>

Although the missings are small in number (11.5%), there are a huge proportion of missing variables (66.7%). Hence we cannot simply drop the missing values. Imputing the missings is necessary.

Step 3

Dealing with missing values and data mistakes

rooms_calsulation <-  houses %>%
  mutate(Trooms = Bedroom2 + Bathroom)

rooms_changing <- rooms_calsulation %>%
  mutate(Rooms = if_else(Rooms < !is.na(Trooms),Trooms, Rooms))

new_houses1 <- rooms_changing %>%
  select(Price, Rooms, Type, Distance, Bedroom2, Bathroom)

We found that the number of total rooms is not equal to the sum of bathrooms and bedrooms in some cases. So we fixed this problem by replacing the existing data with the sum of number of bathrooms and bedrooms.

names(new_houses1)[2] <- "TotalRooms"
names(new_houses1)[5] <- "Bedrooms"

imputed_houses <- new_houses1 %>%
  filter(!is.na(Price)) %>%
  mutate(
    Distance = if_else(
      is.na(Distance), mean(Distance, na.rm = TRUE), Distance),
    Bedrooms = if_else(
      is.na(Bedrooms), as.integer(median(Bedrooms, na.rm = TRUE)), Bedrooms    ),
    Bathroom = if_else(
      is.na(Bathroom), as.integer(median(Bathroom, na.rm = TRUE)), Bathroom    ),
    TotalRooms = if_else(is.na(TotalRooms),Bathroom+Bedrooms,TotalRooms),
    Type = sub("u", "0", Type),
    Type = sub("t", "1", Type),
    Type = sub("h", "2", Type)  )
  colSums(is.na(imputed_houses))
##      Price TotalRooms       Type   Distance   Bedrooms   Bathroom 
##          0          0          0          0          0          0

As we will create a regression for predicting Price useing this dataset, so we drop the missings in Price and impute the missing values in Distance, Bedrooms, Bathroom and TotalRooms. Additionally, Type is changed from a catorgorical vairbale to a numberical variable for regression line prediction. In the final code, we use “colSums” function to make sure there are no missing values in the dataset.

Step 4

Anaylsing the data by plots

imputed_houses$Price <- as.numeric(imputed_houses$Price)
imputed_houses$TotalRooms <- as.numeric(imputed_houses$TotalRooms)
imputed_houses$Type <- as.integer(imputed_houses$Type)
ggplot(data = imputed_houses,aes(x=TotalRooms,y=Price))+
  geom_smooth()
## `geom_smooth()` using method = 'gam'

ggplot(data = imputed_houses,aes(x=TotalRooms,y=log(Price)))+
  geom_smooth()
## `geom_smooth()` using method = 'gam'

This plot shows that the number of total rooms strongly effect the price. We also tested the correlation between total rooms and logarithmic price in order to create our regression.

ggplot(imputed_houses)+
  geom_smooth(aes(x = Distance,y = log(Price)), se = FALSE,color = "orange")+
  facet_wrap(~Type)
## `geom_smooth()` using method = 'gam'

ggplot(imputed_houses)+
  geom_smooth(aes(x = Distance,y = Price), se = FALSE, color = "orange")+
  facet_wrap(~Type)
## `geom_smooth()` using method = 'gam'

This graph indicates that the distance has a negative impact on the price. We also tested the correlation between distance and logarithmic price in order to create our final regression.

ggplot(imputed_houses,aes(x=Bedrooms,y=Price))+
  geom_smooth(color="red")
## `geom_smooth()` using method = 'gam'

ggplot(imputed_houses,aes(x=Bedrooms,y=log(Price)))+
  geom_smooth(color="red")
## `geom_smooth()` using method = 'gam'

According to this graph, we can that the number of bedrooms can impact the final price of houses. We also tested the correlation between the number of bedrooms and logarithmic price in order to create our final regression.

ggplot(imputed_houses,aes(x=Bathroom,y=Price))+
  geom_smooth(color="purple")
## `geom_smooth()` using method = 'gam'

ggplot(imputed_houses,aes(x=Bathroom,y=log(Price)))+
  geom_smooth(color="purple")
## `geom_smooth()` using method = 'gam'

According to this graph, we can that the number of bathrooms can effect the final price of houses. Additionally, We detected the correlation between the number of bedrooms and logarithmic price in order to create our final regression.

tbg1 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg1, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

tbg2 <- filter(imputed_houses, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg2, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'

tbg3 <- filter(imputed_houses, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg3, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg4 <- filter(imputed_houses, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg4, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg5 <- filter(imputed_houses, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg5, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg6 <- filter(imputed_houses, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg6, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)

tbg7 <- filter(imputed_houses, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg7, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)

ggplot(imputed_houses)+
  geom_hex(aes(x = Distance,y = log(Price)))+
  facet_wrap(~Type)

ggplot(imputed_houses)+
  geom_hex(aes(x = Distance,y = Price))+
  facet_wrap(~Type)

imputed_houses <- mutate(imputed_houses, logprice = log(Price))
tbg1 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg1, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

tbg2 <- filter(imputed_houses, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg2, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'

tbg3 <- filter(imputed_houses, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg3, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg4 <- filter(imputed_houses, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg4, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg5 <- filter(imputed_houses, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg5, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'

tbg6 <- filter(imputed_houses, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg6, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)

tbg7 <- filter(imputed_houses, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg7, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)

ggplot(imputed_houses)+
  geom_hex(aes(x = Distance,y = log(Price)))+
  facet_wrap(~Type)

ggplot(imputed_houses)+
  geom_hex(aes(x = Distance,y = Price))+
  facet_wrap(~Type)

imputed_houses <- imputed_houses %>%
  mutate(Type = sub("u", "0", Type),
         Type = sub("t", "1", Type),
         Type = sub("h", "2", Type))

Step 5

Establish the regression

sim1 <- lm(Price ~ TotalRooms, data=imputed_houses)
sim2 <- lm(Price ~ Distance, data=imputed_houses)
sim3 <- lm(Price ~ Bedrooms, data=imputed_houses)
sim4 <- lm(Price ~ Bathroom, data=imputed_houses)
sim5 <- lm(Price ~ Type, data=imputed_houses)
glance(sim1)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2164467      0.216418 567827.8  7526.088       0  2 -399671.9 799349.9
##        BIC     deviance df.residual
## 1 799374.5 8.784562e+15       27245
glance(sim2)
##    r.squared adj.r.squared    sigma statistic       p.value df    logLik
## 1 0.04468259    0.04464753 626983.7  1274.317 8.333567e-273  2 -402372.2
##        AIC    BIC     deviance df.residual
## 1 804750.4 804775 1.071024e+16       27245
glance(sim3)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.1483859     0.1483546 591975.6  4747.189       0  2 -400806.7 801619.4
##        BIC     deviance df.residual
## 1 801644.1 9.547605e+15       27245
glance(sim4)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.1566628     0.1566319 589091.8  5061.176       0  2 -400673.7 801353.3
##        BIC    deviance df.residual
## 1 801377.9 9.45481e+15       27245
glance(sim5)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.1364347     0.1363713 596125.8  2152.139       0  3 -400996.6 802001.1
##      BIC     deviance df.residual
## 1 802034 9.681592e+15       27244

The first model type we tried is single factor model, we will see the relation ship between price and different factors, seems like in single factor model, Distance model(sim2) has the lowest R^2, which mean distance is the worst variables to fit the pridicting price.

sim11 <- lm(Price ~ TotalRooms+Distance, data=imputed_houses)
sim12 <- lm(Price ~ TotalRooms*Distance, data=imputed_houses)
sim13 <- lm(Price ~ TotalRooms+Bedrooms, data=imputed_houses)
sim14 <- lm(Price ~ TotalRooms*Bedrooms, data=imputed_houses)
sim15 <- lm(Price ~ TotalRooms+Bathroom, data=imputed_houses)
sim16 <- lm(Price ~ TotalRooms*Bathroom, data=imputed_houses)
sim17 <- lm(Price ~ TotalRooms+Type, data=imputed_houses)
sim18 <- lm(Price ~ TotalRooms*Type, data=imputed_houses)

glance(sim11)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.3449257     0.3448776 519201.1  7172.587       0  3 -397232.1 794472.2
##        BIC    deviance df.residual
## 1 794505.1 7.34416e+15       27244
glance(sim12)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.3691967     0.3691272 509501.3  5314.929       0  4 -396717.8 793445.5
##        BIC     deviance df.residual
## 1 793486.6 7.072054e+15       27243
glance(sim13)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2164565      0.216399 567834.7  3763.123       0  3 -399671.8 799351.6
##        BIC     deviance df.residual
## 1 799384.4 8.784452e+15       27244
glance(sim14)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2164565     0.2163702 567845.1  2508.657       0  4 -399671.8 799353.6
##        BIC     deviance df.residual
## 1 799394.6 8.784452e+15       27243
glance(sim15)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2466267     0.2465714 556795.2  4459.342       0  3 -399136.8 798281.7
##        BIC     deviance df.residual
## 1 798314.5 8.446208e+15       27244
glance(sim16)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2466318     0.2465489 556803.5  2972.867       0  4 -399136.7 798283.5
##        BIC     deviance df.residual
## 1 798324.6 8.446151e+15       27243
glance(sim17)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2350628     0.2349786 561062.5  2790.563       0  4 -399344.4 798698.7
##        BIC     deviance df.residual
## 1 798739.8 8.575853e+15       27243
glance(sim18)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2351641     0.2350237 561045.9  1675.158       0  6 -399342.6 798699.1
##        BIC     deviance df.residual
## 1 798756.6 8.574718e+15       27241

By adding another factor, the adjusted r^2 of model is significantly improved, and for total rooms and distance, the model with interaction has higher adjusted r^2, therefore total rooms will be times by distance in our final model

sim19 <- lm(Price ~ Distance+Bedrooms, data=imputed_houses)
sim20 <- lm(Price ~ Distance*Bedrooms, data=imputed_houses)
sim21 <- lm(Price ~ Distance+Bathroom, data=imputed_houses)
sim22 <- lm(Price ~ Distance*Bathroom, data=imputed_houses)
sim23 <- lm(Price ~ Distance+Type, data=imputed_houses)
sim24 <- lm(Price ~ Distance*Type, data=imputed_houses)

glance(sim19)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2497995     0.2497444 555621.5  4535.813       0  3 -399079.3 798166.7
##        BIC     deviance df.residual
## 1 798199.5 8.410637e+15       27244
glance(sim20)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2661513     0.2660705 549542.9  3293.485       0  4 -398779.1 797568.2
##        BIC     deviance df.residual
## 1 797609.3 8.227315e+15       27243
glance(sim21)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2258794     0.2258226 564409.9  3974.742       0  3 -399506.9 799021.9
##        BIC    deviance df.residual
## 1 799054.7 8.67881e+15       27244
glance(sim22)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2504733     0.2503908 555382.1  3034.646       0  4 -399067.1 798144.2
##        BIC     deviance df.residual
## 1 798185.3 8.403083e+15       27243
glance(sim23)
##   r.squared adj.r.squared  sigma statistic p.value df    logLik      AIC
## 1 0.2336836     0.2335992 561568  2769.196       0  4 -399368.9 798747.8
##        BIC     deviance df.residual
## 1 798788.9 8.591316e+15       27243
glance(sim24)
##   r.squared adj.r.squared  sigma statistic p.value df    logLik      AIC
## 1 0.2505556      0.250418 555372  1821.452       0  6 -399065.6 798145.2
##        BIC     deviance df.residual
## 1 798202.7 8.402161e+15       27241

In theses 6 models, models with interaction shown a higher adjusted r^2 compared with models without interaction. Therefore, in final model, distance will have interaction with other factors

sim25 <- lm(Price ~ Bedrooms+Bathroom, data=imputed_houses)
sim26 <- lm(Price ~ Bedrooms*Bathroom, data=imputed_houses)
sim27 <- lm(Price ~ Bedrooms+Type, data=imputed_houses)
sim28 <- lm(Price ~ Bedrooms*Type, data=imputed_houses)
sim29 <- lm(Price ~ Bathroom+Type, data=imputed_houses)
sim30 <- lm(Price ~ Bathroom*Type, data=imputed_houses)

glance(sim25)
##   r.squared adj.r.squared    sigma statistic p.value df  logLik      AIC
## 1 0.1945787     0.1945195 575707.5  3290.887       0  3 -400047 800101.9
##        BIC     deviance df.residual
## 1 800134.8 9.029729e+15       27244
glance(sim26)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.1945807      0.194492 575717.4  2193.872       0  4 -400046.9 800103.8
##        BIC     deviance df.residual
## 1 800144.9 9.029706e+15       27243
glance(sim27)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1  0.200891      0.200803 573457.6  2282.907       0  4 -399939.8 799889.5
##        BIC    deviance df.residual
## 1 799930.6 8.95896e+15       27243
glance(sim28)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2029494     0.2028031 572739.6  1387.251       0  6 -399904.6 799823.2
##        BIC     deviance df.residual
## 1 799880.7 8.935883e+15       27241
glance(sim29)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2314362     0.2313516 562390.9  2734.545       0  4 -399408.8 798827.6
##        BIC     deviance df.residual
## 1 798868.7 8.616512e+15       27243
glance(sim30)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2335036     0.2333629 561654.6  1659.727       0  6 -399372.1 798758.2
##        BIC     deviance df.residual
## 1 798815.7 8.593334e+15       27241

In these 6 models, there is not significant difference between model with interaction and model without interaction, therefore there will not be interaction between these factors in the final model

sim31 <- lm(Price ~ Distance*TotalRooms+Distance*Bedrooms+Distance*Bathroom+Distance*Type, data=imputed_houses)
sim32 <- lm(Price ~ Distance*TotalRooms*Bedrooms*Bathroom*Type, data=imputed_houses)
sim33 <- lm(Price ~ Distance*TotalRooms*(Bedrooms+Bathroom)*Type, data=imputed_houses)
sim34 <- lm(log(Price) ~ Distance*TotalRooms*Bedrooms*Bathroom*Type, data=imputed_houses)

glance(sim31)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4347131     0.4344848 482388.1  1904.007       0 12 -395223.8 790473.6
##        BIC     deviance df.residual
## 1 790580.4 6.337537e+15       27235
glance(sim32)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4452829     0.4443243 478173.1  464.5361       0 48 -394966.7 790031.3
##        BIC     deviance df.residual
## 1 790433.7 6.219037e+15       27199
glance(sim33)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4424147     0.4416975 479301.9  616.8716       0 36 -395036.9 790147.8
##        BIC     deviance df.residual
## 1 790451.7 6.251192e+15       27211
glance(sim34)
##   r.squared adj.r.squared     sigma statistic p.value df    logLik
## 1 0.5425566     0.5417662 0.3499522  686.3772       0 48 -10029.57
##        AIC      BIC deviance df.residual
## 1 20157.14 20559.56 3330.967       27199

Based on the results above, the sim34 has the largest adjusted R-squared which is 0.539. Sim34 is the best regression of price.

Step 6

Summary the final model

summary(sim34)
## 
## Call:
## lm(formula = log(Price) ~ Distance * TotalRooms * Bedrooms * 
##     Bathroom * Type, data = imputed_houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22443 -0.24485 -0.01407  0.22292  2.47430 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                 13.805721   0.415854  33.198
## Distance                                    -0.215951   0.043955  -4.913
## TotalRooms                                  -0.125527   0.204015  -0.615
## Bedrooms                                    -0.896634   0.182081  -4.924
## Bathroom                                    -0.880427   0.392874  -2.241
## Type1                                        0.124199   0.791999   0.157
## Type2                                       -0.235680   0.426790  -0.552
## Distance:TotalRooms                          0.076549   0.019148   3.998
## Distance:Bedrooms                            0.108199   0.020943   5.166
## TotalRooms:Bedrooms                          0.322259   0.063045   5.112
## Distance:Bathroom                            0.133684   0.039717   3.366
## TotalRooms:Bathroom                          0.216376   0.184870   1.170
## Bedrooms:Bathroom                            0.766044   0.173623   4.412
## Distance:Type1                               0.065623   0.079324   0.827
## Distance:Type2                               0.171420   0.044664   3.838
## TotalRooms:Type1                             0.034648   0.380441   0.091
## TotalRooms:Type2                             0.279545   0.206813   1.352
## Bedrooms:Type1                               0.503800   0.331563   1.519
## Bedrooms:Type2                               0.851587   0.186105   4.576
## Bathroom:Type1                               0.374593   0.623293   0.601
## Bathroom:Type2                               0.905942   0.396624   2.284
## Distance:TotalRooms:Bedrooms                -0.039407   0.006229  -6.327
## Distance:TotalRooms:Bathroom                -0.034351   0.015929  -2.157
## Distance:Bedrooms:Bathroom                  -0.077498   0.019673  -3.939
## TotalRooms:Bedrooms:Bathroom                -0.208657   0.053812  -3.878
## Distance:TotalRooms:Type1                   -0.019786   0.035600  -0.556
## Distance:TotalRooms:Type2                   -0.069262   0.019319  -3.585
## Distance:Bedrooms:Type1                     -0.066311   0.033326  -1.990
## Distance:Bedrooms:Type2                     -0.103033   0.021179  -4.865
## TotalRooms:Bedrooms:Type1                   -0.195566   0.109480  -1.786
## TotalRooms:Bedrooms:Type2                   -0.314213   0.063555  -4.944
## Distance:Bathroom:Type1                     -0.073268   0.062229  -1.177
## Distance:Bathroom:Type2                     -0.131473   0.039886  -3.296
## TotalRooms:Bathroom:Type1                    0.016310   0.291169   0.056
## TotalRooms:Bathroom:Type2                   -0.148783   0.186069  -0.800
## Bedrooms:Bathroom:Type1                     -0.472624   0.284374  -1.662
## Bedrooms:Bathroom:Type2                     -0.749141   0.174711  -4.288
## Distance:TotalRooms:Bedrooms:Bathroom        0.021604   0.004886   4.422
## Distance:TotalRooms:Bedrooms:Type1           0.022594   0.010471   2.158
## Distance:TotalRooms:Bedrooms:Type2           0.036933   0.006262   5.898
## Distance:TotalRooms:Bathroom:Type1           0.006793   0.026421   0.257
## Distance:TotalRooms:Bathroom:Type2           0.030979   0.016001   1.936
## Distance:Bedrooms:Bathroom:Type1             0.054173   0.028550   1.897
## Distance:Bedrooms:Bathroom:Type2             0.076970   0.019724   3.902
## TotalRooms:Bedrooms:Bathroom:Type1           0.109128   0.076302   1.430
## TotalRooms:Bedrooms:Bathroom:Type2           0.198599   0.053860   3.687
## Distance:TotalRooms:Bedrooms:Bathroom:Type1 -0.012144   0.007054  -1.722
## Distance:TotalRooms:Bedrooms:Bathroom:Type2 -0.020939   0.004888  -4.284
##                                             Pr(>|t|)    
## (Intercept)                                  < 2e-16 ***
## Distance                                    9.02e-07 ***
## TotalRooms                                  0.538373    
## Bedrooms                                    8.51e-07 ***
## Bathroom                                    0.025035 *  
## Type1                                       0.875390    
## Type2                                       0.580804    
## Distance:TotalRooms                         6.41e-05 ***
## Distance:Bedrooms                           2.40e-07 ***
## TotalRooms:Bedrooms                         3.22e-07 ***
## Distance:Bathroom                           0.000764 ***
## TotalRooms:Bathroom                         0.241840    
## Bedrooms:Bathroom                           1.03e-05 ***
## Distance:Type1                              0.408086    
## Distance:Type2                              0.000124 ***
## TotalRooms:Type1                            0.927435    
## TotalRooms:Type2                            0.176490    
## Bedrooms:Type1                              0.128656    
## Bedrooms:Type2                              4.76e-06 ***
## Bathroom:Type1                              0.547852    
## Bathroom:Type2                              0.022371 *  
## Distance:TotalRooms:Bedrooms                2.54e-10 ***
## Distance:TotalRooms:Bathroom                0.031052 *  
## Distance:Bedrooms:Bathroom                  8.19e-05 ***
## TotalRooms:Bedrooms:Bathroom                0.000106 ***
## Distance:TotalRooms:Type1                   0.578360    
## Distance:TotalRooms:Type2                   0.000337 ***
## Distance:Bedrooms:Type1                     0.046628 *  
## Distance:Bedrooms:Type2                     1.15e-06 ***
## TotalRooms:Bedrooms:Type1                   0.074060 .  
## TotalRooms:Bedrooms:Type2                   7.70e-07 ***
## Distance:Bathroom:Type1                     0.239049    
## Distance:Bathroom:Type2                     0.000981 ***
## TotalRooms:Bathroom:Type1                   0.955330    
## TotalRooms:Bathroom:Type2                   0.423941    
## Bedrooms:Bathroom:Type1                     0.096528 .  
## Bedrooms:Bathroom:Type2                     1.81e-05 ***
## Distance:TotalRooms:Bedrooms:Bathroom       9.83e-06 ***
## Distance:TotalRooms:Bedrooms:Type1          0.030948 *  
## Distance:TotalRooms:Bedrooms:Type2          3.72e-09 ***
## Distance:TotalRooms:Bathroom:Type1          0.797101    
## Distance:TotalRooms:Bathroom:Type2          0.052862 .  
## Distance:Bedrooms:Bathroom:Type1            0.057775 .  
## Distance:Bedrooms:Bathroom:Type2            9.55e-05 ***
## TotalRooms:Bedrooms:Bathroom:Type1          0.152664    
## TotalRooms:Bedrooms:Bathroom:Type2          0.000227 ***
## Distance:TotalRooms:Bedrooms:Bathroom:Type1 0.085159 .  
## Distance:TotalRooms:Bedrooms:Bathroom:Type2 1.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.35 on 27199 degrees of freedom
## Multiple R-squared:  0.5426, Adjusted R-squared:  0.5418 
## F-statistic: 686.4 on 47 and 27199 DF,  p-value: < 2.2e-16

In “step 4” we have proved that all 5 variable have significant effect on price, so we do not need to drop any variables in our final model. Furthermore, in “step 5” we tried lots of formula to find out the best equation of the regression. Finally, the sim34 comes out to be the best model which has the largest adjusted R-squared.

Step 7

Predicting the Price

imputed_houses <- add_predictions(imputed_houses, sim34)
tbg21 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg21, mapping = aes(x = Distance, color = TotalRooms))+
  geom_line(aes(y = pred), colour = "red")+
  geom_point(aes(y = logprice))+
  facet_grid(Bathroom ~ Type)

imputed_houses <- add_residuals(imputed_houses, sim34)

ggplot(imputed_houses, aes(resid))+
  geom_freqpoly(binwidth = 0.5)

ggplot(imputed_houses, aes(logprice, resid))+
  geom_ref_line(h = 0)+
  geom_point()

These plots are from data that filled in the pridicted price.

rooms_calsulation <-  houses %>%
  mutate(special = Bedroom2 + Bathroom)

rooms_changing <- rooms_calsulation %>%
  mutate(TotalRooms = if_else(Rooms < !is.na(special),special, Rooms))

houses3 <-  rooms_changing %>%
  select(Price, TotalRooms, Type, Distance, Bedroom2, Bathroom)


houses4 <- houses3 %>%
  mutate(
    Distance = if_else(
      is.na(Distance), mean(Distance, na.rm = TRUE), Distance),
    Bedrooms = if_else(
      is.na(Bedroom2), as.integer(median(Bedroom2, na.rm = TRUE)), Bedroom2    ),
    Bathroom = if_else(
      is.na(Bathroom), as.integer(median(Bathroom, na.rm = TRUE)), Bathroom
    ))
  
  md.pattern(houses4)
## Warning in data.matrix(x): NAs introduced by coercion
##       TotalRooms Distance Bathroom Bedrooms Price Bedroom2  Type      
## 20806          1        1        1        1     1        1     0     1
##  5834          1        1        1        1     0        1     0     2
##  6441          1        1        1        1     1        0     0     2
##  1776          1        1        1        1     0        0     0     3
##                0        0        0        0  7610     8217 34857 50684

We run the code again to make sure there is no missings in the prdicted results.

houses4 <- houses4 %>%
  mutate(Type = sub("0", "unit_houses", Type),
         Type = sub("1", "town_houses", Type),
         Type = sub("2", "houses", Type))
tbg21 <- filter(houses4, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg21, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 6830 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 6830 rows containing missing values (geom_point).

tbg22 <- filter(houses4, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg22, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 643 rows containing non-finite values (stat_smooth).
## Warning: Removed 643 rows containing missing values (geom_point).

tbg23 <- filter(houses4, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg23, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 92 rows containing non-finite values (stat_smooth).
## Warning: Removed 92 rows containing missing values (geom_point).

tbg24 <- filter(houses4, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg24, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 31 rows containing non-finite values (stat_smooth).
## Warning: Removed 31 rows containing missing values (geom_point).

tbg25 <- filter(houses4, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg25, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

tbg26 <- filter(houses4, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg26, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 2 rows containing missing values (geom_point).

tbg27 <- filter(houses4, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg27, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)

tbg28 <- filter(houses4, Bathroom >= 8 & TotalRooms <= 8)
ggplot(tbg28, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 1 rows containing missing values (geom_point).

tbg29 <- filter(houses4, TotalRooms >= 8)
ggplot(tbg29, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
  geom_point()+
  geom_smooth(se = FALSE)+
  facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 2.574
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.574
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.026
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.826
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 10.365
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.735
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 45.36
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 4

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 4
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 10.47
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 3.33
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 7.4529
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6525
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.0022563
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6525
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0475
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 17.247
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.0022562
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.0022562
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 10 rows containing missing values (geom_point).

ggplot(houses4)+
  geom_hex(aes(x = Distance,y = log(Price)))+
  facet_wrap(~Type)
## Warning: Removed 7610 rows containing non-finite values (stat_binhex).

ggplot(houses4)+
  geom_hex(aes(x = Distance,y = Price))+
  facet_wrap(~Type)
## Warning: Removed 7610 rows containing non-finite values (stat_binhex).

We make plots for the pridicted data.